Load necessary packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(scales)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(treemap)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Choose a color palette
colors <- brewer.pal(9, "YlGnBu")
Load Dataset
budget_data <- read.csv('bcl14.csv')
check dimensions of data
dim(budget_data)
## [1] 270 6
Check for missing values in the entire dataset
sum(is.na(budget_data))
## [1] 0
Check for missing values in all columns
sapply(budget_data, function(x) sum(is.na(x)))
## Fund Department
## 0 0
## BCL.Code BCL.Name
## 0 0
## BCL.Purpose X2014.Expenditure.Allowance
## 0 0
duplicate check
sum(duplicated(budget_data))
## [1] 0
View structure of data
str(budget_data)
## 'data.frame': 270 obs. of 6 variables:
## $ Fund : chr "General Subfund" "General Subfund" "General Subfund" "General Subfund" ...
## $ Department : chr "Civil Service Commissions" "Department of Neighborhoods" "Department of Neighborhoods" "Department of Neighborhoods" ...
## $ BCL.Code : chr "V1CIV" "I3100" "I3200" "I3300" ...
## $ BCL.Name : chr "Civil Service Commissions" "Director's Office" "Internal Operations" "Community Building" ...
## $ BCL.Purpose : chr "The purpose of the Civil Service Commissions Budget Control Level is to provide administrative support to the P"| __truncated__ "The purpose of the Director's Office Budget Control Level is to provide executive leadership, communications, a"| __truncated__ "The purpose of the Internal Operations Budget Control Level is to provide financial, human resources, facility,"| __truncated__ "The purpose of the Community Building Budget Control Level is to deliver technical assistance, support services"| __truncated__ ...
## $ X2014.Expenditure.Allowance: num 385887 485705 1426675 3458080 5631045 ...
View summary of data
summary(budget_data)
## Fund Department BCL.Code BCL.Name
## Length:270 Length:270 Length:270 Length:270
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## BCL.Purpose X2014.Expenditure.Allowance
## Length:270 Min. : 1999
## Class :character 1st Qu.: 1309259
## Mode :character Median : 4850475
## Mean : 17326490
## 3rd Qu.: 13468717
## Max. :305641502
Correct data types
budget_data$Fund <- as.factor(budget_data$Fund)
budget_data$Department <- as.factor(budget_data$Department)
budget_data$BCL.Code <- as.factor(budget_data$BCL.Code)
budget_data$BCL.Name <- as.factor(budget_data$BCL.Name)
budget_data$BCL.Purpose <- as.factor(budget_data$BCL.Purpose)
Outlier check
boxplot(budget_data$Fund, main = "Fund", ylab = "Expenditure Allowance", col = "steelblue")
boxplot(budget_data$Department, main = "Department", ylab = "Expenditure Allowance", col = "darkgreen")
boxplot(budget_data$BCL.Code, main = "BCL Code", ylab = "Expenditure Allowance", col = "purple")
boxplot(budget_data$BCL.Name, main = "BCL Name", ylab = "Expenditure Allowance", col = "red")
boxplot(budget_data$BCL.Purpose, main = "BCL Purpose", ylab = "Expenditure Allowance", col = "orange")
boxplot(
budget_data$`X2014.Expenditure.Allowance`,
main = "2014 Expenditure Allowance",
ylab = "Expenditure Allowance",
col = "blue",
boxwex = 0.5, # Narrower box
outpch = 20, # Solid circle for outlier points
outcol = "red" # Red color for outlier points
)
Create new variable for total expenditure
budget_data$total_expenditure <- budget_data$`X2014.Expenditure.Allowance`
Summarize total expenditure by department
department_summary <- budget_data %>%
group_by(Department) %>%
summarise(total_expenditure = sum(total_expenditure)) %>%
arrange(desc(total_expenditure))
Create a vector of custom colors
my_colors <- c("steelblue", "orange", "green", "purple", "red", "blue", "yellow", "brown", "grey", "pink")
Create a bar plot of top 10 departments by total expenditure
ggplot(head(department_summary, 10), aes(x = Department, y = total_expenditure, fill = Department)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = my_colors) + # Use custom colors
xlab("Department") +
ylab("Total Expenditure") +
ggtitle("Top 10 Departments by Total Expenditure in 2014")
Summarize total expenditure by fund
fund_summary <- budget_data %>%
group_by(Fund) %>%
summarise(total_expenditure = sum(total_expenditure)) %>%
arrange(desc(total_expenditure))
Calculate the total expenditure allowance for each BCL code
expenditure_by_bcl <- budget_data %>%
group_by(BCL.Code) %>%
summarize(total_expenditure = sum(`X2014.Expenditure.Allowance`))
Create a bar plot without key
ggplot(expenditure_by_bcl, aes(x = BCL.Code, y = total_expenditure, fill = BCL.Code)) +
geom_bar(stat = "identity") +
xlab("BCL Code") +
ylab("Total Expenditure Allowance") +
ggtitle("Distribution of Expenditure Allowance by BCL Code") +
scale_fill_hue(l = 40, c = 80) + # adds a color scale
guides(fill = FALSE) # removes the legend/key
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
count unique elements
length(unique(budget_data$BCL.Code))
## [1] 250
Create the scatterplot
ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, y = Department, color = Department)) +
geom_point() +
scale_color_hue(l = 40, c = 80) + # adds a color scale
xlab("Expenditure Allowance") +
ylab("Department Count") +
ggtitle("Relationship Between Expenditure Allowance and Department Count")
Create a density plot without key
ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, fill = Department)) +
geom_density(alpha = 0.5) +
xlab("Expenditure Allowance (in R)") +
ylab("Density") +
ggtitle("Expenditure Allowance by Department") +
scale_fill_hue(l = 40, c = 80) + # adds a color scale
theme_classic() +
guides(fill = FALSE) # removes the legend/key
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
Create a summary table with counts of departments by fund and expenditure allowance
dept_summary <- budget_data %>%
group_by(Fund, `X2014.Expenditure.Allowance`) %>%
summarize(count = n())
## `summarise()` has grouped output by 'Fund'. You can override using the
## `.groups` argument.
Create a heatmap
ggplot(dept_summary, aes(x = `X2014.Expenditure.Allowance`, y = Fund, fill = count)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
labs(x = "Expenditure Allowance", y = "Fund", title = "Department Distribution by Fund and Expenditure Allowance")
Define the expenditure allowance ranges
budget_data$ranges <- cut(budget_data$X2014.Expenditure.Allowance, breaks = seq(0, 8000000, by = 1000000))
Create a summary table with counts of departments by fund and expenditure allowance range
dept_summary <- budget_data %>%
group_by(Fund, ranges) %>%
summarize(count = n())
## `summarise()` has grouped output by 'Fund'. You can override using the
## `.groups` argument.
Create a stacked bar chart
ggplot(dept_summary, aes(x = Fund, y = count, fill = ranges)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("#FF9999", "#FF6666", "#FF3333", "#FF0000", "#CC0000", "#990000", "#660000", "#330000")) +
labs(x = "Fund", y = "Number of Departments", title = "Number of Departments by Fund and Expenditure Allowance Ranges") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
highest expenditure allowance in department
expenditure_by_dept <- budget_data %>%
group_by(Department) %>%
summarize(total_expenditure = sum(`X2014.Expenditure.Allowance`)) %>%
arrange(desc(total_expenditure))
Create the bar chart
ggplot(expenditure_by_dept, aes(x = Department, y = total_expenditure, fill = total_expenditure)) +
geom_bar(stat = "identity") +
scale_fill_gradientn(colors = colors, guide = FALSE) +
xlab("Department") +
ylab("Total Expenditure Allowance") +
ggtitle("Total Expenditure Allowance by Department in 2014") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Create bubble chart
ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, y = Department, size = `X2014.Expenditure.Allowance`, color = Department)) +
geom_point(alpha = 0.7) +
scale_color_hue(l = 40, c = 80) + # adds a color scale
xlab("Expenditure Allowance (in R)") +
ylab("Department") +
ggtitle("Expenditure Allowance by Department") +
theme_classic()
Create bubble chart
ggplot(department_summary, aes(x = total_expenditure, y = Department, size = total_expenditure, color = Department)) +
geom_point(alpha = 0.7) +
scale_color_hue(l = 40, c = 80) + # adds a color scale
xlab("Total Expenditure (in R)") +
ylab("Department") +
ggtitle("Total Expenditure by Department") +
theme_classic()
Aggregate the data by BCL code and department
agg_data <- aggregate(X2014.Expenditure.Allowance ~ BCL.Code + Department, data = budget_data, sum)
Create the treemap
treemap(agg_data,
index = c("Department", "BCL.Code"),
vSize = "X2014.Expenditure.Allowance",
type = "index",
palette=brewer.pal(9, "Set1"),
title = "Seattle 2014 Endorsed Budget by BCL Code and Department")
budget_data$Fund <- as.numeric(budget_data$Fund)
budget_data$Department <- as.numeric(budget_data$Department)
plot_ly(budget_data, x = ~Fund, y = ~Department, z = ~`X2014.Expenditure.Allowance`, color = ~`X2014.Expenditure.Allowance`,
marker = list(size = 7, opacity = 0.8, colorscale = 'Viridis')) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = "Fund"),
yaxis = list(title = "Department"),
zaxis = list(title = "2014 Expenditure Allowance")))